Chapter 3 Exploratory Analysis
library(feather)
library(skimr)
library(tidyverse)
library(DataExplorer)
trans <- read_feather("data/transactions.feather")3.1 Response Variable
log_error something something text
skim(trans) %>%
skimr::pander()Skim summary statistics
n obs: 167888
n variables: 12
| variable | missing | complete | n | min | max |
|---|---|---|---|---|---|
| date | 0 | 167888 | 167888 | 2016-01-01 | 2017-09-25 |
| month_year | 0 | 167888 | 167888 | 2016-01-01 | 2017-09-01 |
| week | 0 | 167888 | 167888 | 2015-12-27 | 2017-09-24 |
| median | n_unique |
|---|---|
| 2016-10-11 | 616 |
| 2016-10-01 | 21 |
| 2016-10-09 | 92 |
| variable | missing | complete | n | n_unique |
|---|---|---|---|---|
| month | 0 | 167888 | 167888 | 12 |
| wday | 0 | 167888 | 167888 | 7 |
| top_counts | ordered |
|---|---|
| Jun: 22378, May: 20448, Aug: 20412, Jul: 19437 | FALSE |
| Fri: 44914, Thu: 34143, Wed: 32692, Tue: 31404 | FALSE |
| variable | missing | complete | n | mean | sd | p0 |
|---|---|---|---|---|---|---|
| day_of_month | 0 | 167888 | 167888 | 16.43 | 8.99 | 1 |
| id_parcel | 0 | 167888 | 167888 | 1.3e+07 | 3e+06 | 1.1e+07 |
| p25 | p50 | p75 | p100 |
|---|---|---|---|
| 9 | 16 | 24 | 31 |
| 1.2e+07 | 1.3e+07 | 1.4e+07 | 1.7e+08 |
| variable | missing | complete | n | mean | sd | p0 |
|---|---|---|---|---|---|---|
| abs_log_error | 0 | 167888 | 167888 | 0.069 | 0.15 | 0 |
| log_error | 0 | 167888 | 167888 | 0.014 | 0.17 | -4.66 |
| week_of_year | 0 | 167888 | 167888 | 22.15 | 11.5 | 1 |
| week_since_start | 0 | 167888 | 167888 | 46.31 | 26.84 | 1 |
| year | 0 | 167888 | 167888 | 2016.46 | 0.5 | 2016 |
| p25 | p50 | p75 | p100 |
|---|---|---|---|
| 0.014 | 0.032 | 0.069 | 5.26 |
| -0.025 | 0.006 | 0.039 | 5.26 |
| 13 | 22 | 31 | 53 |
| 23 | 41 | 72 | 91 |
| 2016 | 2016 | 2017 | 2017 |
trans %>%
ggplot(aes(x = log_error)) +
geom_histogram(bins=400, fill = "red", alpha = 0.5) +
theme_bw()
Figure 3.1: Distribution of Log Error
trans %>%
filter(
log_error > quantile(log_error, probs = c(.05)),
log_error < quantile(log_error, probs = c(.95))
) %>%
ggplot(aes(x = log_error)) +
geom_histogram(fill = "red", alpha = 0.5) +
theme_bw()
Figure 3.2: Distribution of Log Error Between 5 and 95 Percentile
trans %>%
filter(
log_error > quantile(abs_log_error, probs = c(.05)),
log_error < quantile(abs_log_error, probs = c(.95))
) %>%
ggplot(aes(x = abs_log_error)) +
geom_histogram(fill = "red", alpha = 0.5) +
theme_bw()## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Figure 3.3: Distribution of Absolute value of Log Error Between 5 and 95 Percentile
trans %>%
group_by(month_year) %>%
summarise(mean_log_error = mean(log_error)) %>%
ggplot(aes(x = month_year, y = mean_log_error)) +
geom_line(size = 1, colour = "red") +
geom_point(size = 3, colour = "red") +
theme_bw()
Figure 3.4: Average Log Error by Month
trans %>%
group_by(month_year, year, month) %>%
summarise(mean_log_error = mean(log_error)) %>%
ungroup() %>%
ggplot(aes(x = as.numeric(month), y = mean_log_error)) +
geom_path(aes(colour = as.factor(year)), size = 1) +
theme_bw() +
ylim(c(0, .03)) +
scale_x_continuous(breaks = 1:12, labels = levels(trans$month)) +
labs(
colour = NULL,
x = "month"
)
Figure 3.5: Average Log Error by Month. 2017 Looks to have a higher baseline
trans %>%
group_by(week_since_start) %>%
summarise(mean_log_error = mean(log_error)) %>%
ggplot(aes(x = week_since_start, y = mean_log_error)) +
geom_line(colour = "red", size = 1) +
geom_smooth() +
theme_bw()
Figure 3.6: Average Log Error by Week
3.1.1 Transactions Over Time
trans %>%
group_by(week_since_start) %>%
summarise(n = n()) %>%
ggplot(aes(x = week_since_start, y = n)) +
geom_line(colour = "red", size = 1) +
theme_bw() +
labs(
y = "Numeber of Transactions"
)
Figure 3.7: Number of Transactions per Week. The dip in the middle coorisponds to the hold out testing data
trans %>%
group_by(wday) %>%
count() %>%
ggplot(aes(x = wday, y = n)) +
geom_bar(stat = "identity", fill = "red", alpha = 0.5) +
theme_bw()
Figure 3.8: Number of Transactions by Day of the Week.
3.1.2 Map
library(leaflet)
library(leaflet.extras)
read_feather("data/properties_geo_only.feather") %>%
right_join(trans, by = "id_parcel") %>%
filter(
!is.na(lat)
) %>%
leaflet() %>%
addProviderTiles(providers$CartoDB.DarkMatter) %>%
addHeatmap(lng=~lon, lat=~lat, intensity = ~log_error,
radius = 5, minOpacity = 0.2, cellSize = 6)Figure 3.9: Distribution of Log Errors
3.2 Predictor Variables
Now lets take a look at the properties dataset
properties <- read_feather("data/properties_17.feather")skim(properties) %>%
skimr::pander()## Warning: Skimr's histograms incorrectly render with pander on Windows.
## Removing them. Use kable() if you'd like them rendered.
Skim summary statistics
n obs: 2985217
n variables: 58
| variable | missing | complete | n | min | max | empty | n_unique |
|---|---|---|---|---|---|---|---|
| fips | 2932 | 2982285 | 2985217 | 5 | 5 | 0 | 3 |
| pooltypeid10 | 2968211 | 17006 | 2985217 | 1 | 1 | 0 | 1 |
| pooltypeid2 | 2952161 | 33056 | 2985217 | 1 | 1 | 0 | 1 |
| rawcensustractandblock | 2932 | 2982285 | 2985217 | 12 | 16 | 0 | 100529 |
| str_arch_style | 2979156 | 6061 | 2985217 | 1 | 2 | 0 | 8 |
| str_material | 2978471 | 6746 | 2985217 | 1 | 2 | 0 | 5 |
| zoning_landuse_county | 2999 | 2982218 | 2985217 | 1 | 4 | 0 | 234 |
| zoning_property | 1002746 | 1982471 | 2985217 | 1 | 10 | 0 | 5651 |
| variable | missing | complete | n | n_unique |
|---|---|---|---|---|
| str_flag_fireplace | 2980054 | 5163 | 2985217 | 1 |
| str_flag_tub | 2935155 | 50062 | 2985217 | 1 |
| tax_delinquency | 2928702 | 56515 | 2985217 | 1 |
| zoning_landuse | 2932 | 2982285 | 2985217 | 16 |
| top_counts | ordered |
|---|---|
| NA: 2980054, No: 5163 | FALSE |
| NA: 2935155, No: 50062 | FALSE |
| NA: 2928702, Yes: 56515 | FALSE |
| Sin: 2152863, Con: 483789, Dup: 114415, Pla: 61559 | FALSE |
| variable | missing | complete | n | mean |
|---|---|---|---|---|
| area_base | 2963735 | 21482 | 2985217 | 2427.56 |
| area_basement | 2983590 | 1627 | 2985217 | 647.22 |
| area_firstfloor_finished_1 | 2781459 | 203758 | 2985217 | 1379.78 |
| area_firstfloor_finished_2 | 2781459 | 203758 | 2985217 | 1392.03 |
| area_garage | 2094209 | 891008 | 2985217 | 383.16 |
| area_living_finished | 264431 | 2720786 | 2985217 | 1764.04 |
| area_living_perimeter | 2977546 | 7671 | 2985217 | 1178.92 |
| area_patio | 2903629 | 81588 | 2985217 | 321.54 |
| area_pool | 2957259 | 27958 | 2985217 | 519.72 |
| area_shed | 2982571 | 2646 | 2985217 | 278.37 |
| area_total | 2795032 | 190185 | 2985217 | 2754.87 |
| id_parcel | 0 | 2985217 | 2985217 | 1.3e+07 |
| latitude | 2932 | 2982285 | 2985217 | 3.4e+07 |
| longitude | 2932 | 2982285 | 2985217 | -1.2e+08 |
| num_75_bath | 2668860 | 316357 | 2985217 | 1.01 |
| num_bath | 117156 | 2868061 | 2985217 | 2.25 |
| num_fireplace | 2672093 | 313124 | 2985217 | 1.17 |
| num_garage | 2094209 | 891008 | 2985217 | 1.83 |
| num_pool | 2445585 | 539632 | 2985217 | 1 |
| num_story | 2299541 | 685676 | 2985217 | 1.4 |
| num_unit | 1004175 | 1981042 | 2985217 | 1.18 |
| pooltypeid7 | 2479322 | 505895 | 2985217 | 1 |
| region_city | 62128 | 2923089 | 2985217 | 34987.66 |
| region_county | 2932 | 2982285 | 2985217 | 2569.09 |
| region_neighbor | 1828476 | 1156741 | 2985217 | 193538.7 |
| region_zip | 12714 | 2972503 | 2985217 | 96553.29 |
| str_aircon | 2169855 | 815362 | 2985217 | 1.95 |
| str_deck | 2967838 | 17379 | 2985217 | 66 |
| str_framing | 2972486 | 12731 | 2985217 | 3.73 |
| str_heating | 1116053 | 1869164 | 2985217 | 4.08 |
| str_quality | 1043822 | 1941395 | 2985217 | 6.28 |
| str_story | 2983594 | 1623 | 2985217 | 7 |
| tax_delinquency_year | 2928700 | 56517 | 2985217 | 13.89 |
| tax_year | 2933 | 2982284 | 2985217 | 2016 |
| sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|
| 7786.19 | 117 | 1072 | 2008 | 3411 | 952576 |
| 538.79 | 20 | 272 | 535 | 847.5 | 8516 |
| 634.42 | 1 | 1010 | 1281 | 1615 | 31303 |
| 682.32 | 3 | 1012 | 1284 | 1619 | 41906 |
| 246.22 | 0 | 312 | 441 | 494 | 7749 |
| 1031.38 | 1 | 1198 | 1542 | 2075 | 427079 |
| 357.09 | 120 | 960 | 1296 | 1440 | 2688 |
| 236.88 | 10 | 190 | 270 | 390 | 7983 |
| 191.33 | 19 | 430 | 495 | 594 | 17410 |
| 369.78 | 10 | 96 | 168 | 320 | 6141 |
| 5999.38 | 112 | 1696 | 2173 | 2975 | 820242 |
| 7909966.39 | 1.1e+07 | 1.2e+07 | 1.3e+07 | 1.4e+07 | 1.7e+08 |
| 243515.71 | 3.3e+07 | 3.4e+07 | 3.4e+07 | 3.4e+07 | 3.5e+07 |
| 345591.77 | -1.2e+08 | -1.2e+08 | -1.2e+08 | -1.2e+08 | -1.2e+08 |
| 0.12 | 1 | 1 | 1 | 1 | 7 |
| 0.99 | 1 | 2 | 2 | 3 | 32 |
| 0.46 | 1 | 1 | 1 | 1 | 9 |
| 0.61 | 0 | 2 | 2 | 2 | 25 |
| 0 | 1 | 1 | 1 | 1 | 1 |
| 0.54 | 1 | 1 | 1 | 2 | 41 |
| 2.49 | 1 | 1 | 1 | 1 | 997 |
| 0 | 1 | 1 | 1 | 1 | 1 |
| 50709.68 | 3491 | 12447 | 25218 | 45457 | 4e+05 |
| 788.68 | 1286 | 1286 | 3101 | 3101 | 3101 |
| 165725.27 | 6952 | 46736 | 118920 | 274800 | 764167 |
| 3680.82 | 95982 | 96180 | 96377 | 96974 | 4e+05 |
| 3.16 | 1 | 1 | 1 | 1 | 13 |
| 0 | 66 | 66 | 66 | 66 | 66 |
| 0.5 | 1 | 3 | 4 | 4 | 5 |
| 3.29 | 1 | 2 | 2 | 7 | 24 |
| 1.73 | 1 | 5 | 6 | 8 | 12 |
| 0 | 7 | 7 | 7 | 7 | 7 |
| 2.56 | 0 | 14 | 14 | 15 | 99 |
| 0.06 | 2000 | 2016 | 2016 | 2016 | 2016 |
| variable | missing | complete | n | mean |
|---|---|---|---|---|
| area_living_finished_calc | 45097 | 2940120 | 2985217 | 1831.46 |
| area_lot | 272706 | 2712511 | 2985217 | 22603.76 |
| build_year | 47833 | 2937384 | 2985217 | 1964.44 |
| censustractandblock | 74985 | 2910232 | 2985217 | 6e+13 |
| num_bathroom | 2957 | 2982260 | 2985217 | 2.22 |
| num_bathroom_calc | 117156 | 2868061 | 2985217 | 2.3 |
| num_bedroom | 2945 | 2982272 | 2985217 | 3.09 |
| num_room | 2969 | 2982248 | 2985217 | 1.47 |
| tax_building | 46464 | 2938753 | 2985217 | 178142.89 |
| tax_land | 59926 | 2925291 | 2985217 | 268455.77 |
| tax_property | 22752 | 2962465 | 2985217 | 5408.95 |
| tax_total | 34266 | 2950951 | 2985217 | 443527.93 |
| sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|
| 1954.2 | 1 | 1215 | 1574 | 2140 | 952576 |
| 249983.63 | 100 | 5683 | 7000 | 9893 | 3.7e+08 |
| 23.64 | 1801 | 1950 | 1963 | 1981 | 2016 |
| 3.2e+11 | -1 | 6e+13 | 6e+13 | 6.1e+13 | 4.8e+14 |
| 1.08 | 0 | 2 | 2 | 3 | 32 |
| 1 | 1 | 2 | 2 | 3 | 32 |
| 1.27 | 0 | 2 | 3 | 4 | 25 |
| 2.84 | 0 | 0 | 0 | 0 | 96 |
| 460050.31 | 1 | 77666 | 127066 | 2e+05 | 2.6e+08 |
| 486509.71 | 1 | 79700 | 176619 | 326100 | 9.4e+07 |
| 9675.57 | 0.24 | 2468.62 | 4007.62 | 6230.5 | 3823175.65 |
| 816336.63 | 1 | 188220 | 321161 | 514072 | 3.2e+08 |
3.2.1 Missingness
missing_data <- plot_missing(properties, theme = theme_bw())
Figure 3.10: Completeness by Feature. Many are extremely sparse
missing_data## # A tibble: 58 x 4
## feature num_missing pct_missing group
## <fct> <int> <dbl> <chr>
## 1 id_parcel 0 0 Good
## 2 str_aircon 2169855 0.727 Bad
## 3 str_arch_style 2979156 0.998 Remove
## 4 area_basement 2983590 0.999 Remove
## 5 num_bathroom 2957 0.000991 Good
## 6 num_bedroom 2945 0.000987 Good
## 7 str_framing 2972486 0.996 Remove
## 8 str_quality 1043822 0.350 OK
## 9 num_bathroom_calc 117156 0.0392 Good
## 10 str_deck 2967838 0.994 Remove
## # ... with 48 more rows
There seem to be quite a lot of missing features. For now lets remove the ones that are over 50% and continue on with those. We could come back to the ones we dropped and try to recover some of those missing values with more sophisticated methods, for example we could impute the missing values based on their spatial neighbors but for now we will continue with the ones that have over 50% of their values.
A few of the features, rawcensustractandblock, fips, and censustractandblock, and region_county are ID fields for their census geography units. Since we have already extracted that information earlier in properties_geo we will drop them here as well since we can add the information contained in those features in a cleaner format later.
Additionally, based on the descriptions, zoning_landuse, zoning_landuse_county, and zoning_property all seem to contain pretty similar information.Since the number of unique categories are fairly large for each one, if they are redundant they could add needless complexity and computation time to our model. Let’s use a chi-squared test to see what it looks like
chisq.test(properties$zoning_landuse, properties$zoning_property)##
## Pearson's Chi-squared test
##
## data: properties$zoning_landuse and properties$zoning_property
## X-squared = 4377000, df = 73450, p-value < 2.2e-16
chisq.test(properties$zoning_landuse, properties$zoning_landuse_county)##
## Pearson's Chi-squared test
##
## data: properties$zoning_landuse and properties$zoning_landuse_county
## X-squared = 37455000, df = 3262, p-value < 2.2e-16
Based on that, let’s remove zoning_property and zoning_landuse_county
features_to_keep <- missing_data %>%
filter(
pct_missing <= .50,
!feature %in% c("rawcensustractandblock", "fips",
"censustractandblock", "region_county",
"zoning_property", "zoning_landuse_county")
) %>%
select(feature) %>%
.$feature %>%
as.character()## Warning: package 'bindrcpp' was built under R version 3.4.4
properties <- properties %>%
select(features_to_keep)3.2.2 Numeric Features
Lets look at the histograms of all the numeric features
properties %>%
select(
-id_parcel
) %>%
plot_histogram(ggtheme = theme_bw(), fill = "red", alpha = 0.5)
Figure 3.11: Distriubtions of All Numeric Features
Figure 3.11: Distriubtions of All Numeric Features
Looking at the histograms a few things become obvious. There are huge outliers in many of the features and there are some features that are currently encoded as numeric but should not be treated as such. For example, str_quality is an ordinal scale 1 (best qaulity) to 12 (worst) but if we leave them as numeric they will be treated as ratio. str_heating is nominal so the order doesn’t have meaning. Other that need to be changed are region_city, region_zip Once we do this, we’ll look again at the relationships between our numeric features.
properties <- properties %>%
mutate(
str_quality = factor(str_quality,
levels = min(str_quality, na.rm = TRUE):max(str_quality, na.rm = TRUE),
ordered = TRUE),
str_heating = factor(str_heating,
levels = na.omit(unique(str_heating)),
ordered = FALSE),
region_city = factor(region_city,
levels = na.omit(unique(region_city)),
ordered = FALSE),
region_zip = factor(region_zip,
levels = na.omit(unique(region_zip)),
ordered = FALSE)
)3.2.3 Numeric Outliers
Based on the histograms there looks to be lots of outliers in many of our numeric features. Two groups of features pop out, the num_* features and the tax_*features. Let’s take a closer look.
properties %>%
select(starts_with("num_")) %>%
plot_histogram(ggtheme = theme_bw(), fill = "red", alpha = 0.5)
Figure 3.12: Distriubtions of ’num_*’ Features
Looking at the num_bathroom, num_bathroom_calc, num_bath is pretty interesting. num_bathroom was one of the most complete features we had however, looking at the distributions, it seems strange that there would be so many houses with 0 bathrooms.
sum(properties$num_bathroom == 0, na.rm = TRUE)## [1] 113470
sum(properties$num_bathroom_calc == 0, na.rm = TRUE)## [1] 0
Now for comparing all 3
properties %>%
group_by(
num_bathroom_calc,
num_bath,
num_bathroom
) %>%
count() %>%
DT::datatable()If you sort by descending by n you’ll see that one of the most frequent combinations is blank values of num_bathroom_calc and num_bath which are NA values and 0 for num_bathroom. Based on this I am interpreting that as either 0 being a coded value for NA or it just being wrong. Either way it looks like num_bathroom_calc is the one to keep out of all 3, since it has calculations of half-baths as well.
Applying the same logic to num_room and num_bedroom we can set all values equal to 0 to NA. One side effect of this is that the num_room feature is now almost 100% missing and not very useful anymore. So we will just remove it.
Quickly looking at area_living_finished_calc and area_living_finished reveals a similar *_calc being a corrected version of the feature. Becasue of this we will go ahead and remove area_living_finished as well
properties <- properties %>%
select(
-num_bath,
-num_bathroom,
-num_room,
-area_living_finished
) %>%
mutate(
num_bedroom = ifelse(num_bedroom == 0, NA, num_bedroom)
)Now let’s look at the tax related features
properties %>%
select(starts_with("tax_")) %>%
plot_histogram(ggtheme = theme_bw(), fill = "red", alpha = 0.5)
Figure 3.13: Distriubtions of ’tax_*’ Features
Lets look at the highest values for tax_total and see if something jumps out
properties %>%
mutate(tax_rank = rank(desc(tax_total))) %>%
filter(tax_rank <= 20) %>%
select(
zoning_landuse,
starts_with("area_"),
starts_with("tax_")
) %>%
arrange(tax_rank) %>%
DT::datatable()While the values are extremely large, they appear to look legitimate. We won’t remove these, but it does indicate that we should perhaps apply some transformations to our tax features before we start applying our model.
Now a look at the relationships between our remaining numeric features
library(heatmaply)
properties %>%
select(-id_parcel) %>%
select_if(is.numeric) %>%
cor(use = "pairwise.complete.obs") %>%
heatmaply_cor()Figure 3.14: Correlation of Numeric Features
3.2.4 Categorical Features
plot_bar(properties, ggtheme = theme_bw())## 2 columns ignored with more than 50 categories.
## region_city: 187 categories
## region_zip: 404 categories
Figure 3.15: Distriubtions of All Categorical Features
The distribution across categories are extremely non-uniform, especially str_heating and zoning_landuse. This imbalance could cause use some pain later one when trying to fit our model. One way we can avoid some of this pain is by collapsing some of the rare categories into an other category. The number of categories we collapse to is not a hard and fast decision, it can be based on number of observations, subject matter expertise, heterogentity of the response variable within categories, or some mix of all of these).
Let’s look at what the distribution of log_error looks like across these categories.
library(ggridges)
library(forcats)
properties %>%
select(
id_parcel,
str_quality
) %>%
right_join(trans, by = "id_parcel") %>%
ggplot(aes(x = log_error, y = fct_reorder(str_quality, log_error), fill = factor(..quantile..))) +
stat_density_ridges(
geom = "density_ridges_gradient",
calc_ecdf = TRUE,
quantiles = c(0.05, 0.95)
) +
scale_fill_manual(
name = "Probability",
values = c("#FF0000A0", "#A0A0A0A0", "#0000FFA0"),
labels = c("(0, 0.05]", "(0.05, 0.95]", "(0.95, 1]")
) +
xlim(c(-0.5, 0.5)) +
theme_bw() +
labs(
y = "str_quality"
)
Figure 3.16: Distribution of Log Error Across Structure Quality Feature
library(ggridges)
properties %>%
select(
id_parcel,
str_heating
) %>%
right_join(trans, by = "id_parcel") %>%
ggplot(aes(x = log_error, y = fct_reorder(str_heating, log_error), fill = factor(..quantile..))) +
stat_density_ridges(
geom = "density_ridges_gradient",
calc_ecdf = TRUE,
quantiles = c(0.05, 0.95)
) +
scale_fill_manual(
name = "Probability",
values = c("#FF0000A0", "#A0A0A0A0", "#0000FFA0"),
labels = c("(0, 0.05]", "(0.05, 0.95]", "(0.95, 1]")
) +
xlim(c(-0.5, 0.5)) +
theme_bw() +
labs(
y = "str_heating"
)
Figure 3.17: Distribution of Log Error Across Heating Type Feature
library(ggridges)
properties %>%
select(
id_parcel,
zoning_landuse
) %>%
right_join(trans, by = "id_parcel") %>%
ggplot(aes(x = log_error, y = fct_reorder(zoning_landuse, log_error), fill = factor(..quantile..))) +
stat_density_ridges(
geom = "density_ridges_gradient",
calc_ecdf = TRUE,
quantiles = c(0.05, 0.95)
) +
scale_fill_manual(
name = "Probability",
values = c("#FF0000A0", "#A0A0A0A0", "#0000FFA0"),
labels = c("(0, 0.05]", "(0.05, 0.95]", "(0.95, 1]")
) +
xlim(c(-0.5, 0.5)) +
theme_bw() +
labs(
y = "zoning_landuse"
)
Figure 3.18: Distribution of Log Error Across Zoning Feature
Since the distributions of log_error within each category seems well behaved, we will recode them based on number of observations
properties <- properties %>%
mutate(
str_heating = fct_lump(str_heating, n = 6),
zoning_landuse = fct_lump(zoning_landuse, n = 8),
str_heating = fct_recode(str_heating,
"Central" = "2",
"Floor/Wall" = "7",
"Solar" = "20",
"Forced Air" = "6",
"Yes - Type Unknown" = "24",
"None" = "13"
)
)3.3 Exploring log_error A little More
Now let’s join the properties and properties_geo tables to our trans table of tranactions and their log_error’s and explore those
trans_prop <- read_feather("data/properties_geo_only.feather") %>%
right_join(trans, by = "id_parcel") %>%
left_join(properties, by = "id_parcel")trans_prop %>%
group_by(id_geo_bg_fips, id_geo_county_name) %>%
summarise(
n = n(),
mean_abs_error = mean(abs_log_error)
) %>%
ungroup() %>%
mutate(
trans_pert = cut(n, breaks = c(seq(0, 100, 10), 350))
) %>%
ggplot(aes(x = trans_pert, y = mean_abs_error, colour = id_geo_county_name)) +
geom_boxplot(outlier.size = 1.5, outlier.alpha = 1/3) +
theme_bw() +
labs(
subtitle = "Block Group Average Mean Absolute Error",
colour = NULL,
x = "Number of Total Transactions per Block Group",
y = "Mean Absolute Log Error"
)
Figure 3.19: Outliers and Variability of Mean Absolute Error Dreceases When Neighborhood Sales Increase
It looks like Los Angeles is largerly the only county that has information populated for str_qaulity
trans_prop %>%
ggplot(aes(x = str_quality, y = log_error, colour = id_geo_county_name)) +
geom_boxplot(outlier.size = 1.5, outlier.alpha = 1/3) +
theme_bw() +
labs(
colour = NULL
)
Figure 3.20: Log Error by Structure Quality
library(ggmap)
trans_prop_tmp <- trans_prop %>%
filter(!is.na(id_geo_county_name)) %>%
group_by(
id_parcel,
id_geo_county_name
) %>%
mutate(
log_error_parcel_avg = mean(log_error)
) %>%
ungroup() %>%
mutate(
outlier = ifelse(log_error < quantile(log_error, probs = .1) |
log_error > quantile(log_error, probs = .9),
"Outlier", "Normal")
)
error_map <- get_map(location = "Los Angeles, CA",
color="bw",
crop = FALSE,
zoom = 9)
ggmap(error_map) +
stat_density2d(
data = trans_prop_tmp,
aes(x = lon, y = lat,
fill = ..level..,
alpha = ..level..),
geom = "polygon",
size = 0.001,
bins = 100
) +
scale_fill_viridis_c() +
scale_alpha(range = c(0.05, 0.95), guide = FALSE) +
facet_wrap(~outlier)
Figure 3.21: Spatial Distribution of Log Error Outliers
Now lets look at the spatiotemporal distribution of log_error outliers
trans_prop %>%
filter(
!is.na(lat),
(
log_error <= quantile(log_error, probs = .1) |
log_error >= quantile(log_error, probs = .9)
)
) %>%
mutate(
lon = round(lon/0.5, digits = 1) * 0.5,
lat = round(lat/0.5, digits = 1) * 0.5
) %>%
group_by(lon, lat, month_year) %>%
summarise(
n = n()
) %>%
ggplot(aes(lon, lat)) +
geom_raster(aes(fill = n)) +
scale_fill_viridis_c() +
facet_wrap(~month_year, ncol = 3) +
coord_quickmap() +
theme_dark() +
labs(
subtitle = "Downtown Los Angeles looks to be consistently bad",
fill = "Count"
) +
theme(
axis.text = element_text(size = 5)
)
Figure 3.22: SpatioTemporal Distribution of Log Error Outliers
At first glance there looks to be a strong spatial and temporal correlation to log_error. Let’s look more into the spatial correlation
library(spatstat)
library(spdep)
d <- trans_prop %>%
filter(
!is.na(lat),
(
log_error <= quantile(log_error, probs = .1) |
log_error >= quantile(log_error, probs = .9)
)
) %>%
mutate(
lon = round(lon/0.1, digits = 1) * 0.1,
lat = round(lat/0.1, digits = 1) * 0.1
) %>%
group_by(lon, lat) %>%
summarise(
n = n()
)
coordinates(d) <- ~lon + lat
w <- knn2nb(knearneigh(d, k = 10, longlat = TRUE))
moran.test(d$n, nb2listw(w))##
## Moran I test under randomisation
##
## data: d$n
## weights: nb2listw(w)
##
## Moran I statistic standard deviate = 71.112, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 4.308120e-01 -1.952362e-04 3.673489e-05
local_moran <- as.data.frame(localmoran(d$n, nb2listw(w)))
d %>%
as.data.frame() %>%
cbind(local_moran) %>%
ggplot(aes(lon, lat)) +
geom_raster(aes(fill = Ii)) +
scale_fill_viridis_c() +
coord_quickmap() +
theme_dark() +
labs(
title = "Local Moran's I on Outliers Density",
fill = "Local Moran's I"
)
Figure 3.23: Spatial Autocorrelation of Log Error Outliers
Let’s save our pared down properties table and then get into feature engineering
write_feather(properties, "data/properties_17_filtered.feather")